“data cleaning”

# Load necessary libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(raster)          #raster()
## Loading required package: sp
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
## 
## Attaching package: 'raster'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(sf)              #st_read()
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1; sf_use_s2() is TRUE
library(ggspatial)       #annotation_scale,annotation_north_arrow
library(ggnewscale)      #new_scale_color() 
library(ggsn)            #scalebar()
## Loading required package: grid
## Warning: multiple methods tables found for 'elide'
## 
## Attaching package: 'ggsn'
## 
## The following object is masked from 'package:raster':
## 
##     scalebar
library(shiny)           #Shiny app
## This version of 'bslib' is designed to work with 'shiny' >= 1.6.0.
##     Please upgrade via install.packages('shiny').
library(plotly)          #plot_ly()
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:raster':
## 
##     select
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(gridExtra)       #grid.arrange()
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
# Set working directory
setwd("~/FinalProject")

# Read data files
Unemployrate <-read_csv("/home/rstudio/FinalProject/unemployment_county.csv")
## Rows: 37674 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): County, State
## dbl (2): Unemployment Rate, Year
## num (3): Labor Force, Employed, Unemployed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(Unemployrate)
## # A tibble: 6 × 7
##   County       State `Labor Force` Employed Unemployed `Unemployment Rate`  Year
##   <chr>        <chr>         <dbl>    <dbl>      <dbl>               <dbl> <dbl>
## 1 Autauga Cou… AL            24383    23577        806                 3.3  2007
## 2 Baldwin Cou… AL            82659    80099       2560                 3.1  2007
## 3 Barbour Cou… AL            10334     9684        650                 6.3  2007
## 4 Bibb County  AL             8791     8432        359                 4.1  2007
## 5 Blount Coun… AL            26629    25780        849                 3.2  2007
## 6 Bullock Cou… AL             3653     3308        345                 9.4  2007
Crimerate<-read_csv("/home/rstudio/FinalProject/crime_and_incarceration_by_state.csv")
## Rows: 816 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): jurisdiction
## dbl (13): year, prisoner_count, state_population, violent_crime_total, murde...
## lgl  (3): includes_jails, crime_reporting_change, crimes_estimated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(Crimerate)
## # A tibble: 6 × 17
##   jurisdiction includes_jails  year prisoner_count crime_reporting_change
##   <chr>        <lgl>          <dbl>          <dbl> <lgl>                 
## 1 FEDERAL      FALSE           2001         149852 NA                    
## 2 ALABAMA      FALSE           2001          24741 FALSE                 
## 3 ALASKA       TRUE            2001           4570 FALSE                 
## 4 ARIZONA      FALSE           2001          27710 FALSE                 
## 5 ARKANSAS     FALSE           2001          11489 FALSE                 
## 6 CALIFORNIA   FALSE           2001         157142 FALSE                 
## # ℹ 12 more variables: crimes_estimated <lgl>, state_population <dbl>,
## #   violent_crime_total <dbl>, murder_manslaughter <dbl>, rape_legacy <dbl>,
## #   rape_revised <dbl>, robbery <dbl>, agg_assault <dbl>,
## #   property_crime_total <dbl>, burglary <dbl>, larceny <dbl>,
## #   vehicle_theft <dbl>
States<-st_read("/home/rstudio/FinalProject/tl_2019_us_state.shp")
## Reading layer `tl_2019_us_state' from data source 
##   `/home/rstudio/FinalProject/tl_2019_us_state.shp' using driver `ESRI Shapefile'
## Simple feature collection with 56 features and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
## Geodetic CRS:  NAD83
######## Focus on Contiguous USA
Contiguous_state <- States%>% filter(STUSPS!="AK"& 
                                       STUSPS!="AS"& 
                                       STUSPS!="MP"& 
                                       STUSPS!="PR"& 
                                       STUSPS!="VI"& 
                                       STUSPS!="HI"& 
                                       STUSPS!="GU")

###Check the length of states.49 is what we need.

length(unique(Contiguous_state$STUSPS))
## [1] 49
##Check the length of State
unique(Unemployrate $State)
##  [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL" "IN" "IA"
## [16] "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV" "NH" "NJ"
## [31] "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "UT" "VT"
## [46] "VA" "WA" "WV" "WI" "WY"
# Check the length of unique states in Unemployment data
length(unique(Unemployrate $State))
## [1] 50
# Clean Unemployment data
## `summarise()` has grouped output by 'State'. 

Unemployrate <-  Unemployrate %>% filter(State!="AK"& State!="HI") %>%
  group_by(State,Year) %>% 
  summarise(Totalforce=sum(`Labor Force`),
            Totalemployed=sum(Employed),
            Totalunemployed=sum(Unemployed),
            Meanrate=mean(`Unemployment Rate`,rm.na=TRUE)
            )
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
### Check the length of unique states in Unemployment data again
 
length(unique(Unemployrate $State))
## [1] 48
head(Unemployrate)
## # A tibble: 6 × 6
## # Groups:   State [1]
##   State  Year Totalforce Totalemployed Totalunemployed Meanrate
##   <chr> <dbl>      <dbl>         <dbl>           <dbl>    <dbl>
## 1 AL     2007    2175614       2089131           86483     4.9 
## 2 AL     2008    2176491       2053479          123012     6.98
## 3 AL     2009    2163003       1924749          238254    13.2 
## 4 AL     2010    2196053       1964561          231492    12.3 
## 5 AL     2011    2202672       1990414          212258    11.2 
## 6 AL     2012    2176343       2003292          173051     9.25
# Rename column and filter years

colnames(Unemployrate)
## [1] "State"           "Year"            "Totalforce"      "Totalemployed"  
## [5] "Totalunemployed" "Meanrate"
##Change the column name “State” into  "STUSPS" by using rename().Use   filter() to pick out year data from 2007 to 2014.
Unemployrate <- Unemployrate %>% 
  
  filter(Year %in% c(2007:2014))

head(Unemployrate)
## # A tibble: 6 × 6
## # Groups:   State [1]
##   State  Year Totalforce Totalemployed Totalunemployed Meanrate
##   <chr> <dbl>      <dbl>         <dbl>           <dbl>    <dbl>
## 1 AL     2007    2175614       2089131           86483     4.9 
## 2 AL     2008    2176491       2053479          123012     6.98
## 3 AL     2009    2163003       1924749          238254    13.2 
## 4 AL     2010    2196053       1964561          231492    12.3 
## 5 AL     2011    2202672       1990414          212258    11.2 
## 6 AL     2012    2176343       2003292          173051     9.25
unique(Crimerate$jurisdiction)
##  [1] "FEDERAL"        "ALABAMA"        "ALASKA"         "ARIZONA"       
##  [5] "ARKANSAS"       "CALIFORNIA"     "COLORADO"       "CONNECTICUT"   
##  [9] "DELAWARE"       "FLORIDA"        "GEORGIA"        "HAWAII"        
## [13] "IDAHO"          "ILLINOIS"       "INDIANA"        "IOWA"          
## [17] "KANSAS"         "KENTUCKY"       "LOUISIANA"      "MAINE"         
## [21] "MARYLAND"       "MASSACHUSETTS"  "MICHIGAN"       "MINNESOTA"     
## [25] "MISSISSIPPI"    "MISSOURI"       "MONTANA"        "NEBRASKA"      
## [29] "NEVADA"         "NEW HAMPSHIRE"  "NEW JERSEY"     "NEW MEXICO"    
## [33] "NEW YORK"       "NORTH CAROLINA" "NORTH DAKOTA"   "OHIO"          
## [37] "OKLAHOMA"       "OREGON"         "PENNSYLVANIA"   "RHODE ISLAND"  
## [41] "SOUTH CAROLINA" "SOUTH DAKOTA"   "TENNESSEE"      "TEXAS"         
## [45] "UTAH"           "VERMONT"        "VIRGINIA"       "WASHINGTON"    
## [49] "WEST VIRGINIA"  "WISCONSIN"      "WYOMING"
### Check the length of unique states in Crime data

length(unique(Crimerate$jurisdiction))
## [1] 51
head(Crimerate)
## # A tibble: 6 × 17
##   jurisdiction includes_jails  year prisoner_count crime_reporting_change
##   <chr>        <lgl>          <dbl>          <dbl> <lgl>                 
## 1 FEDERAL      FALSE           2001         149852 NA                    
## 2 ALABAMA      FALSE           2001          24741 FALSE                 
## 3 ALASKA       TRUE            2001           4570 FALSE                 
## 4 ARIZONA      FALSE           2001          27710 FALSE                 
## 5 ARKANSAS     FALSE           2001          11489 FALSE                 
## 6 CALIFORNIA   FALSE           2001         157142 FALSE                 
## # ℹ 12 more variables: crimes_estimated <lgl>, state_population <dbl>,
## #   violent_crime_total <dbl>, murder_manslaughter <dbl>, rape_legacy <dbl>,
## #   rape_revised <dbl>, robbery <dbl>, agg_assault <dbl>,
## #   property_crime_total <dbl>, burglary <dbl>, larceny <dbl>,
## #   vehicle_theft <dbl>
# Clean Crime data
###Change the column name 

Crimerate <-  Crimerate %>% 
  rename("STUSPS"="jurisdiction") %>% 
  rename("Year"="year") %>% 
  filter(STUSPS!="FEDERAL"& STUSPS!="ALASKA"& STUSPS!="HAWAII") %>% 
  filter(Year %in% c(2007:2014))

##Recheck the data
length(unique(Crimerate$STUSPS))
## [1] 48
head(Crimerate)
## # A tibble: 6 × 17
##   STUSPS      includes_jails  Year prisoner_count crime_reporting_change
##   <chr>       <lgl>          <dbl>          <dbl> <lgl>                 
## 1 ALABAMA     FALSE           2007          25253 FALSE                 
## 2 ARIZONA     FALSE           2007          37700 FALSE                 
## 3 ARKANSAS    FALSE           2007          13275 FALSE                 
## 4 CALIFORNIA  FALSE           2007         171444 FALSE                 
## 5 COLORADO    FALSE           2007          22666 FALSE                 
## 6 CONNECTICUT TRUE            2007          19438 FALSE                 
## # ℹ 12 more variables: crimes_estimated <lgl>, state_population <dbl>,
## #   violent_crime_total <dbl>, murder_manslaughter <dbl>, rape_legacy <dbl>,
## #   rape_revised <dbl>, robbery <dbl>, agg_assault <dbl>,
## #   property_crime_total <dbl>, burglary <dbl>, larceny <dbl>,
## #   vehicle_theft <dbl>
###Changes the state names in the state column "STUSPS"
Crimerate$STUSPS <- state.abb[match(str_to_title(Crimerate$STUSPS),state.name)]
###Calculate the crimerate
Crimerate <- Crimerate %>% 
  mutate(Crimerate=(violent_crime_total/state_population)*100000) %>% 
  dplyr::mutate_if(is.numeric, round, 1)

Graphs

Time Seies plots

library(plotly)
# Select the states
selected_states <- c("IL", "CA", "ID", "IA")
#head(Unemployrate)

# Time Series plots for Unemployment Rate
# Filter the data for the selected states
Unemployrate_selected <- Unemployrate %>% filter(State %in% selected_states)
head(Unemployrate_selected)
## # A tibble: 6 × 6
## # Groups:   State [1]
##   State  Year Totalforce Totalemployed Totalunemployed Meanrate
##   <chr> <dbl>      <dbl>         <dbl>           <dbl>    <dbl>
## 1 CA     2007   17893090      16931594          961496     6.78
## 2 CA     2008   18178125      16854483         1323642     8.64
## 3 CA     2009   18215141      16182572         2032569    12.4 
## 4 CA     2010   18336283      16091950         2244333    13.8 
## 5 CA     2011   18415107      16258137         2156970    13.4 
## 6 CA     2012   18523796      16602671         1921125    12.1
# Create time series line plot using plotly
fig <- plot_ly()
#for(state in selected_states){
 fig <- plot_ly( Unemployrate_selected, x = ~Year, y=~Meanrate, color=~State, type="scatter", mode="lines") %>% 
   layout(title = "Time Series Visualization for Selected States",
                      xaxis = list(title = "Year"),
                      yaxis = list(title = "Unemployment Rate"))
  fig
# Time Series plots for Crime Rate

Crimerate_selected <- Crimerate %>% filter(STUSPS %in% selected_states)
head(Crimerate_selected)
## # A tibble: 6 × 18
##   STUSPS includes_jails  Year prisoner_count crime_reporting_change
##   <chr>  <lgl>          <dbl>          <dbl> <lgl>                 
## 1 CA     FALSE           2007         171444 FALSE                 
## 2 ID     FALSE           2007           7055 FALSE                 
## 3 IL     FALSE           2007          45215 FALSE                 
## 4 IA     FALSE           2007           8692 FALSE                 
## 5 CA     FALSE           2008         171085 FALSE                 
## 6 ID     FALSE           2008           7033 FALSE                 
## # ℹ 13 more variables: crimes_estimated <lgl>, state_population <dbl>,
## #   violent_crime_total <dbl>, murder_manslaughter <dbl>, rape_legacy <dbl>,
## #   rape_revised <dbl>, robbery <dbl>, agg_assault <dbl>,
## #   property_crime_total <dbl>, burglary <dbl>, larceny <dbl>,
## #   vehicle_theft <dbl>, Crimerate <dbl>
fig2 <- plot_ly()
#for(state in selected_states){
 fig2 <- plot_ly( Crimerate_selected, x = ~Year, y=~Crimerate, color=~STUSPS , type="scatter", mode="lines") %>% 
   layout(title = "Time Series Visualization for Selected States",
                      xaxis = list(title = "Year"),
                      yaxis = list(title = "Crime Rate"))
  fig2

scatter plot

# Scatter plot for the year 2014
Unemployrate_2014 <- Unemployrate %>% filter(Year == 2014)
Crimerate_2014 <- Crimerate %>% filter(Year == 2014)
# Join datasets for the year 2014

Crimerate <- Crimerate %>% rename(State = STUSPS)
colnames(Crimerate_2014)
##  [1] "STUSPS"                 "includes_jails"         "Year"                  
##  [4] "prisoner_count"         "crime_reporting_change" "crimes_estimated"      
##  [7] "state_population"       "violent_crime_total"    "murder_manslaughter"   
## [10] "rape_legacy"            "rape_revised"           "robbery"               
## [13] "agg_assault"            "property_crime_total"   "burglary"              
## [16] "larceny"                "vehicle_theft"          "Crimerate"
Crimerate_2014 <- Crimerate_2014 %>% rename(State = STUSPS)


colnames(Unemployrate_2014)
## [1] "State"           "Year"            "Totalforce"      "Totalemployed"  
## [5] "Totalunemployed" "Meanrate"
colnames(Crimerate_2014)
##  [1] "State"                  "includes_jails"         "Year"                  
##  [4] "prisoner_count"         "crime_reporting_change" "crimes_estimated"      
##  [7] "state_population"       "violent_crime_total"    "murder_manslaughter"   
## [10] "rape_legacy"            "rape_revised"           "robbery"               
## [13] "agg_assault"            "property_crime_total"   "burglary"              
## [16] "larceny"                "vehicle_theft"          "Crimerate"
data_2014 <- left_join(Unemployrate_2014, Crimerate_2014, by = c("State", "Year"))
# Scatter plot for the relationship between Unemployment Rate and Crime Rate in 2014

fig <- plot_ly(data_2014, x = ~Crimerate, y = ~Meanrate, type = "scatter", mode = "markers", color = ~State) %>%
  layout(title = "Relationship between Unemployment Rate and Crime Rate in 2014",
         xaxis = list(title = "Crime Rate"),
         yaxis = list(title = "Unemployment Rate"))
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

spatial plot

# Spatial plot for Unemployment Rate in 2014
# Join the data
data_2014 <- left_join(Contiguous_state, Unemployrate_2014, by = c("STUSPS" = "State"))

# Create the plot
p <- ggplot() +
  geom_sf(data = data_2014, aes(fill = Meanrate)) +
  labs(title = "Unemployment Rate in 2014", 
       fill = "Unemployment Rate",
       x = "Longitude",
       y = "Latitude") +
  theme_minimal() +
  coord_sf(datum = NA) +
  north(data_2014, scale = 0.1, symbol = 3) +
  scalebar(data_2014, dist = 100, dist_unit = "km", transform = TRUE)

# Print the plot
print(p)

# Spatial plot for Crime Rate in 2014

p2 <- ggplot() +
  geom_sf(data = data_2014, aes(fill = Crimerate)) +
  labs(title = "Crime Rate in 2014", fill = "Crime Rate") +
  theme_minimal()

data_2014 <- left_join(Contiguous_state, Crimerate_2014, by = c("STUSPS" = "State"))

p2 <- ggplot() +
  geom_sf(data = data_2014, aes(fill = Crimerate)) +
  labs(title = "Crime Rate in 2014", fill = "Crime Rate",
       x = "Longitude",
       y = "Latitude") +
  theme_minimal()

p2 <- p2 + north(data_2014) + scalebar(data_2014, dist = 100, dist_unit = "km", transform = TRUE)

# Print the plots
print(p2)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.